In [1]:
%load_ext autoreload
%autoreload 2
In [2]:
import os
from datetime import datetime, timedelta
from pathlib import Path

import numpy
import pandas
from sqlalchemy import select, func

from bsky_topics.config import Config
from bsky_topics.db import configure_db, async_session
from bsky_topics.db.schema import Post, PostEmbedding
from bsky_topics.topics import compute_topics, get_indexed_posts_for_date_range
In [3]:
# Relative path of the TOML settings file handed to Config.load below.
CONFIG_FILE = "../env.toml"
In [4]:
# Read the project settings from CONFIG_FILE.
config = Config.load(CONFIG_FILE)
# Configure the database layer with the URL from the settings. This is the
# cell's last expression, so the returned AsyncEngine is shown as cell output.
configure_db(config.db_url)
Out[4]:
<sqlalchemy.ext.asyncio.engine.AsyncEngine at 0x16a5efc90>
In [5]:
from bertopic import BERTopic

# Location of the previously saved BERTopic model, relative to the notebook's
# working directory. The name suggests it covers the block starting
# 2024-12-01 18:00 -- NOTE(review): confirm against the code that saved it.
saved_model_path = "saved_models/2024-12-01 180000"
loaded_model = BERTopic.load(saved_model_path)
In [6]:
async with async_session() as session:
    curr_date = datetime(year=2024, month=12, day=1, hour=18)
    block_end = curr_date + timedelta(hours=3)

    stmt = (select(Post.post_text)
            .join(PostEmbedding)
            .filter(Post.indexed_at >= curr_date, Post.indexed_at < block_end))

    post_texts = []
    for post_text in await session.execute(stmt):
        post_texts.append(post_text[0])

hierarchical_topics = loaded_model.hierarchical_topics(post_texts)
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4505/4505 [00:33<00:00, 133.25it/s]
In [7]:
# Render the topic hierarchy computed above, limited to 50 topics.
loaded_model.visualize_hierarchy(
    hierarchical_topics=hierarchical_topics,
    top_n_topics=50,
)
In [8]:
# Ids of topics with at least 50 assigned documents. Negative ids are excluded
# (presumably BERTopic's -1 outlier topic -- confirm).
topics_with_at_least50 = {
    topic_id
    for topic_id, n_docs in loaded_model.topic_sizes_.items()
    if topic_id >= 0 and n_docs >= 50
}
In [9]:
async with async_session() as session:
    curr_date = datetime(year=2024, month=12, day=1, hour=18)
    block_end = curr_date + timedelta(hours=3)

    stmt = (select(Post.post_text, Post.indexed_at)
            .join(PostEmbedding)
            .filter(Post.indexed_at >= curr_date, Post.indexed_at < block_end))

    post_texts = []
    timestamps = []
    topics = []
    for i, (post_text, timestamp) in enumerate(await session.execute(stmt)):
        if loaded_model.topics_[i] in topics_with_at_least50:
            post_texts.append(post_text)
            timestamps.append(timestamp)
            topics.append(loaded_model.topics_[i])
In [10]:
# Number of posts kept after filtering to topics in topics_with_at_least50.
print(len(topics))
187760
In [11]:
# Turn on the model's verbose flag before the computation below.
loaded_model.verbose = True

# Topic representations over time for the filtered posts, split into 20 time
# bins, with both tuning options switched off.
topics_over_time = loaded_model.topics_over_time(
    post_texts,
    timestamps,
    topics=topics,
    nr_bins=20,
    global_tuning=False,
    evolution_tuning=False,
)
18it [00:05,  3.30it/s]
In [13]:
# Plot the per-bin topic data computed above, limited to 50 topics.
loaded_model.visualize_topics_over_time(
    topics_over_time,
    top_n_topics=50,
)
In [14]:
# Show the model's topic visualization, limited to 50 topics.
loaded_model.visualize_topics(
    top_n_topics=50,
)
[autoreload of bsky_topics.db.schema failed: Traceback (most recent call last):
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 276, in check
    superreload(m, reload, self.old_objects)
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 500, in superreload
    update_generic(old_obj, new_obj)
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 397, in update_generic
    update(a, b)
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/IPython/extensions/autoreload.py", line 335, in update_class
    if (old_obj == new_obj) is True:
        ^^^^^^^^^^^^^^^^^^
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/sql/operators.py", line 582, in __eq__
    return self.operate(eq, other)
           ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/orm/attributes.py", line 453, in operate
    return op(self.comparator, *other, **kwargs)  # type: ignore[no-any-return]  # noqa: E501
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/orm/relationships.py", line 762, in __eq__
    self.property._optimized_compare(
  File "/Users/lucas/Projects/atproto/bsky_topics/.venv/lib/python3.12/site-packages/sqlalchemy/orm/relationships.py", line 1185, in _optimized_compare
    raise sa_exc.ArgumentError(
sqlalchemy.exc.ArgumentError: Mapped instance expected for relationship comparison to object.   Classes, queries and other SQL elements are not accepted in this context; for comparison with a subquery, use PostEmbedding.post.has(**criteria).
]
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
In [ ]: